In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Raw string (r"...") so the Windows backslashes are not treated as escape
# sequences: in the original plain string, "\C" and "\s" are invalid escapes
# (SyntaxWarning on modern Python) and a directory starting with "t" or "n"
# would silently corrupt the path via "\t"/"\n".
# NOTE(review): hardcoded absolute local path — consider a configurable
# DATA_DIR so the notebook runs on other machines.
data = pd.read_csv(r"D:\Customer Segmentasyon\sales_data_sample.csv", encoding="ISO-8859-1")
In [2]:
data.head(10)
Out[2]:
ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES ORDERDATE STATUS QTR_ID MONTH_ID YEAR_ID ... ADDRESSLINE1 ADDRESSLINE2 CITY STATE POSTALCODE COUNTRY TERRITORY CONTACTLASTNAME CONTACTFIRSTNAME DEALSIZE
0 10107 30 95.70 2 2871.00 2/24/2003 0:00 Shipped 1 2 2003 ... 897 Long Airport Avenue NaN NYC NY 10022 USA NaN Yu Kwai Small
1 10121 34 81.35 5 2765.90 5/7/2003 0:00 Shipped 2 5 2003 ... 59 rue de l'Abbaye NaN Reims NaN 51100 France EMEA Henriot Paul Small
2 10134 41 94.74 2 3884.34 7/1/2003 0:00 Shipped 3 7 2003 ... 27 rue du Colonel Pierre Avia NaN Paris NaN 75508 France EMEA Da Cunha Daniel Medium
3 10145 45 83.26 6 3746.70 8/25/2003 0:00 Shipped 3 8 2003 ... 78934 Hillside Dr. NaN Pasadena CA 90003 USA NaN Young Julie Medium
4 10159 49 100.00 14 5205.27 10/10/2003 0:00 Shipped 4 10 2003 ... 7734 Strong St. NaN San Francisco CA NaN USA NaN Brown Julie Medium
5 10168 36 96.66 1 3479.76 10/28/2003 0:00 Shipped 4 10 2003 ... 9408 Furth Circle NaN Burlingame CA 94217 USA NaN Hirano Juri Medium
6 10180 29 86.13 9 2497.77 11/11/2003 0:00 Shipped 4 11 2003 ... 184, chausse de Tournai NaN Lille NaN 59000 France EMEA Rance Martine Small
7 10188 48 100.00 1 5512.32 11/18/2003 0:00 Shipped 4 11 2003 ... Drammen 121, PR 744 Sentrum NaN Bergen NaN N 5804 Norway EMEA Oeztan Veysel Medium
8 10201 22 98.57 2 2168.54 12/1/2003 0:00 Shipped 4 12 2003 ... 5557 North Pendale Street NaN San Francisco CA NaN USA NaN Murphy Julie Small
9 10211 41 100.00 14 4708.44 1/15/2004 0:00 Shipped 1 1 2004 ... 25, rue Lauriston NaN Paris NaN 75016 France EMEA Perrier Dominique Medium

10 rows × 25 columns

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# numeric column; last expression renders the table as rich output.
describe = data.describe()
describe
Out[3]:
ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES QTR_ID MONTH_ID YEAR_ID MSRP
count 2823.000000 2823.000000 2823.000000 2823.000000 2823.000000 2823.000000 2823.000000 2823.00000 2823.000000
mean 10258.725115 35.092809 83.658544 6.466171 3553.889072 2.717676 7.092455 2003.81509 100.715551
std 92.085478 9.741443 20.174277 4.225841 1841.865106 1.203878 3.656633 0.69967 40.187912
min 10100.000000 6.000000 26.880000 1.000000 482.130000 1.000000 1.000000 2003.00000 33.000000
25% 10180.000000 27.000000 68.860000 3.000000 2203.430000 2.000000 4.000000 2003.00000 68.000000
50% 10262.000000 35.000000 95.700000 6.000000 3184.800000 3.000000 8.000000 2004.00000 99.000000
75% 10333.500000 43.000000 100.000000 9.000000 4508.000000 4.000000 11.000000 2004.00000 124.000000
max 10425.000000 97.000000 100.000000 18.000000 14082.800000 4.000000 12.000000 2005.00000 214.000000
In [4]:
# Count the missing values in each column (method form of pd.isnull).
nan_check = data.isnull().sum()
nan_check
Out[4]:
ORDERNUMBER            0
QUANTITYORDERED        0
PRICEEACH              0
ORDERLINENUMBER        0
SALES                  0
ORDERDATE              0
STATUS                 0
QTR_ID                 0
MONTH_ID               0
YEAR_ID                0
PRODUCTLINE            0
MSRP                   0
PRODUCTCODE            0
CUSTOMERNAME           0
PHONE                  0
ADDRESSLINE1           0
ADDRESSLINE2        2521
CITY                   0
STATE               1486
POSTALCODE            76
COUNTRY                0
TERRITORY           1074
CONTACTLASTNAME        0
CONTACTFIRSTNAME       0
DEALSIZE               0
dtype: int64
In [5]:
#ADDRESSLINE2, STATE, POSTALCODE, TERRITORY have NaN

# Missing-value count and rate per column, sorted worst-first.
# The mean of a boolean null-mask is (# missing) / (# rows), so
# .isnull().mean() replaces the manual .isnull().sum() / .isnull().count()
# division with an identical but clearer computation.
null_variables = data.isnull().sum().sort_values(ascending=False)
null_rates = data.isnull().mean().sort_values(ascending=False)
missing_data = pd.concat([null_variables, null_rates], axis=1, keys=['N of null', 'Rates'])
In [7]:
missing_data
Out[7]:
N of null Rates
ADDRESSLINE2 2521 0.893022
STATE 1486 0.526390
TERRITORY 1074 0.380446
POSTALCODE 76 0.026922
ORDERNUMBER 0 0.000000
CUSTOMERNAME 0 0.000000
CONTACTFIRSTNAME 0 0.000000
CONTACTLASTNAME 0 0.000000
COUNTRY 0 0.000000
CITY 0 0.000000
ADDRESSLINE1 0 0.000000
PHONE 0 0.000000
PRODUCTCODE 0 0.000000
QUANTITYORDERED 0 0.000000
MSRP 0 0.000000
PRODUCTLINE 0 0.000000
YEAR_ID 0 0.000000
MONTH_ID 0 0.000000
QTR_ID 0 0.000000
STATUS 0 0.000000
ORDERDATE 0 0.000000
SALES 0 0.000000
ORDERLINENUMBER 0 0.000000
PRICEEACH 0 0.000000
DEALSIZE 0 0.000000
In [8]:
# ADDRESSLINE2, STATE and TERRITORY are missing in far too many rows to
# impute; the remaining dropped columns are identifiers / contact details
# with no value for clustering, and ORDERDATE is redundant because
# YEAR_ID / MONTH_ID already encode the date. One drop call removes them all.
columns_to_drop = [
    'ADDRESSLINE2', 'STATE', 'TERRITORY',              # mostly missing
    'PHONE', 'ADDRESSLINE1', 'POSTALCODE',             # address/contact identifiers
    'ORDERDATE',                                       # redundant with YEAR_ID / MONTH_ID
    'STATUS', 'CITY', 'CONTACTFIRSTNAME',              # not required for the analysis
    'CONTACTLASTNAME', 'CUSTOMERNAME', 'ORDERNUMBER',
]
data = data.drop(columns_to_drop, axis=1)

data.head(6)
Out[8]:
QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES QTR_ID MONTH_ID YEAR_ID PRODUCTLINE MSRP PRODUCTCODE COUNTRY DEALSIZE
0 30 95.70 2 2871.00 1 2 2003 Motorcycles 95 S10_1678 USA Small
1 34 81.35 5 2765.90 2 5 2003 Motorcycles 95 S10_1678 France Small
2 41 94.74 2 3884.34 3 7 2003 Motorcycles 95 S10_1678 France Medium
3 45 83.26 6 3746.70 3 8 2003 Motorcycles 95 S10_1678 USA Medium
4 49 100.00 14 5205.27 4 10 2003 Motorcycles 95 S10_1678 USA Medium
5 36 96.66 1 3479.76 4 10 2003 Motorcycles 95 S10_1678 USA Medium
In [9]:
#Encoding categorical features

from sklearn.preprocessing import LabelEncoder

# Label-encode every remaining categorical column. A fresh encoder per
# column is equivalent to refitting a single shared instance, since
# fit_transform relearns the classes each call anyway.
for column in ['PRODUCTLINE', 'COUNTRY', 'DEALSIZE', 'PRODUCTCODE']:
    data[column] = LabelEncoder().fit_transform(data[column])

data.head(4)
Out[9]:
QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES QTR_ID MONTH_ID YEAR_ID PRODUCTLINE MSRP PRODUCTCODE COUNTRY DEALSIZE
0 30 95.70 2 2871.00 1 2 2003 1 95 0 18 2
1 34 81.35 5 2765.90 2 5 2003 1 95 0 6 2
2 41 94.74 2 3884.34 3 7 2003 1 95 0 6 1
3 45 83.26 6 3746.70 3 8 2003 1 95 0 18 1
In [10]:
#Finding optimal number of clusters
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize features so the distance-based K-Means is not dominated by
# large-scale columns such as SALES.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

scores_1 = []

range_of_cluster = range(1, 10)

for i in range_of_cluster:
    # random_state pinned so the elbow curve is reproducible and consistent
    # with the final model below, which uses random_state=0.
    kmeans = KMeans(n_clusters=i, random_state=0)
    kmeans.fit(scaled_data)
    scores_1.append(kmeans.inertia_)  # within-cluster sum of squares

# Plot inertia against the actual cluster counts (1..9) instead of index
# positions, so no manual xticks relabelling is needed.
plt.plot(range_of_cluster, scores_1, 'r*-')
plt.title('Elbow Method')
plt.xlabel('n of clusters')
plt.ylabel('Scores')
plt.show()
In [11]:
#Silhouette Score
from sklearn.metrics import silhouette_score

cluster_range = range(2, 10)  # silhouette needs at least 2 clusters
scores = []
for num_clusters in cluster_range:
    # random_state pinned for reproducibility, matching the final model.
    clusterer = KMeans(n_clusters=num_clusters, random_state=0)
    pred = clusterer.fit_predict(scaled_data)
    scores.append(silhouette_score(scaled_data, pred, metric='euclidean'))

# BUG FIX: the original xticks labelled the axis 1..8 although the scores
# belong to 2..9 clusters; plotting against the true cluster counts fixes
# the off-by-one labelling.
plt.plot(cluster_range, scores, 'b*-')
plt.title('Silhouette Score')
plt.xlabel('n of clusters')
plt.ylabel('Scores')
plt.show()
In [12]:
# I selected number of clusters as 5. (n = 5)

kmeans = KMeans(n_clusters=5, random_state=0)

# fit_predict fits the model once and returns the labels; the original
# called fit() and then fit_predict(), running the clustering twice.
y_kmeans = kmeans.fit_predict(scaled_data)

# BUG FIX: columns=data.columns (not [data.columns]) — wrapping the Index in
# a list made pandas build tuple/MultiIndex column labels for the centers.
clusters_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=data.columns)

# Attach the cluster label of each row to the (un-scaled) feature frame.
data_with_cluster = pd.concat([data, pd.DataFrame({'CLUSTER': kmeans.labels_})], axis=1)
data_with_cluster.head(10)
Out[12]:
QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES QTR_ID MONTH_ID YEAR_ID PRODUCTLINE MSRP PRODUCTCODE COUNTRY DEALSIZE CLUSTER
0 30 95.70 2 2871.00 1 2 2003 1 95 0 18 2 2
1 34 81.35 5 2765.90 2 5 2003 1 95 0 6 2 2
2 41 94.74 2 3884.34 3 7 2003 1 95 0 6 1 0
3 45 83.26 6 3746.70 3 8 2003 1 95 0 18 1 0
4 49 100.00 14 5205.27 4 10 2003 1 95 0 18 1 0
5 36 96.66 1 3479.76 4 10 2003 1 95 0 18 1 0
6 29 86.13 9 2497.77 4 11 2003 1 95 0 6 2 3
7 48 100.00 1 5512.32 4 11 2003 1 95 0 11 1 0
8 22 98.57 2 2168.54 4 12 2003 1 95 0 18 2 3
9 41 100.00 14 4708.44 1 1 2004 1 95 0 6 1 1
In [13]:
#Dimension Reduce
from sklearn.decomposition import PCA

# Project the scaled feature space onto its first three principal
# components so the 5 clusters can be inspected visually in 3D.
pca = PCA(n_components=3)
principal_comp = pca.fit_transform(scaled_data)
pca_df = pd.DataFrame(principal_comp, columns=['pca_1', 'pca_2', 'pca_3'])
pca_df['cluster'] = kmeans.labels_  # same row order as scaled_data

#Showing
import plotly.express as px
fig = px.scatter_3d(pca_df, x='pca_1', y='pca_2', z='pca_3',
                    color='cluster', symbol='cluster', size_max=20, opacity=0.6)
fig.write_html('customer_cluster_map.html', auto_open=True) # For saving 3D Figure
In [14]:
# Render the interactive 3D cluster scatter inline in the notebook.
fig.show()
In [ ]: